{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 103,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "#top 5k dice keywords\n",
    "NUM_CLUSTERS         = 3000 # for 25k keywords and phrases\n",
    "# number of cluster synonyms to map to\n",
    "NUM_CLUSTER_SYNONYMS = 5\n",
    "KEY_WORDS_FILE       = \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/top_5k_keywords.txt\"\n",
    "SYNONYMS_QRY_FILE    = \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/cluster_keyword_synonym_qry.txt\"\n",
    "SYNONYMS_INDEX_FILE  = \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/cluster_keyword_synonym_ix.txt\"\n",
    "PHRASES_FILE         = \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/Phrases.txt\"\n",
    "MODEL_FILE           = \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/keyword_model.w2v\"\n",
    "CLUSTERS_FILE        = \"/Users/simon.hughes/Documents/Dice Data/LuceneTalk/%i_clusters.txt\" % NUM_CLUSTERS"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "#Shared\n",
    "#just used to load phrases file\n",
    "def load_stop_words(stop_words_file):\n",
    "    stop_words = set()\n",
    "    with open(stop_words_file) as f:\n",
    "            for line in f:\n",
    "                word = line.strip()\n",
    "                if word[0] != \"#\":\n",
    "                    word = word.lower()\n",
    "                    stop_words.add(word)\n",
    "    return stop_words\n",
    "\n",
    "def get_vector(item, model):\n",
    "    vocab = model.vocab[item]\n",
    "    vector = model.syn0[vocab.index]\n",
    "    return vector\n",
    "\n",
    "def get_norm_vector(item, model):\n",
    "    if item not in model.vocab:\n",
    "        return None\n",
    "    # for deserialized models, the norm vectors are not stored\n",
    "    vec = get_vector(item, model)\n",
    "    norm = np.linalg.norm(vec)\n",
    "    if norm != 0:\n",
    "        return vec / norm\n",
    "    return vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import time\n",
    "grand_start = time.time()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 88,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from collections import defaultdict\n",
    "\n",
    "#functions\n",
    "def is_valid_search_keyword(kw):\n",
    "    q_kw = \" \" + kw + \" \"\n",
    "    for wd in \"(,), and , or , not , true , TRUE , false , FALSE \".split(\",\"):\n",
    "        if wd in q_kw:\n",
    "            return False\n",
    "    # remove queries with negations in them\n",
    "    tokens = kw.split(\" \")\n",
    "    \n",
    "    # remove single char keywords\n",
    "    if len(tokens) == 1 and len(tokens[0]) == 1:\n",
    "        return False\n",
    "    \n",
    "    if any(map(lambda t: t.strip().startswith(\"-\"), tokens)):\n",
    "        return False\n",
    "    return True\n",
    "\n",
    "def map_keyword(kw):\n",
    "    return kw.replace(\" \", \"_\")\n",
    "\n",
    "def extract_clusters(ids, id2kwd):\n",
    "    clusters = defaultdict(set)\n",
    "    for kw_id, label in enumerate(ids):\n",
    "        kw = id2kwd[kw_id]\n",
    "        clusters[label].add(kw)\n",
    "    return clusters\n",
    "\n",
    "def extract_centroids(km_clusterer):\n",
    "    lbl2centroid = dict()\n",
    "    for i in range(len(km_clusterer.cluster_centers_)):\n",
    "        centroid = km_clusterer.cluster_centers_[i]\n",
    "        c_norm = np.linalg.norm(centroid)\n",
    "        if c_norm > 0.0:\n",
    "            n_centroid = centroid / c_norm\n",
    "        else:\n",
    "            n_centroid = centroid\n",
    "        lbl2centroid[i] = n_centroid\n",
    "    return lbl2centroid\n",
    "\n",
    "def compute_cluster_similarities(kwds, kwd2id, vectors, lbl2centroid):\n",
    "    kwd2cluster_sims = dict()\n",
    "    for kwd in kwds:\n",
    "        ix = kwd2id[kwd]\n",
    "        nvec = vectors[ix]\n",
    "        sims = []\n",
    "\n",
    "        for lbl, centroid in lbl2centroid.items():\n",
    "            cosine_sim = np.inner(nvec, centroid)\n",
    "            sims.append((lbl,cosine_sim))\n",
    "        sims = sorted(sims, key = lambda (lbl,sim): -sim)\n",
    "        kwd2cluster_sims[kwd] = sims\n",
    "        if len(kwd2cluster_sims) % 1000 == 0:\n",
    "            print(\"%i computed out of %i\" % (len(kwd2cluster_sims), len(all_kwds)))\n",
    "    return kwd2cluster_sims\n",
    "\n",
    "# expand at query time\n",
    "# use with tfidf (on cluster labels) at index time by just mapping to cluster label\n",
    "def write_most_similar_clusters(topn, kwd2cluster_sims, synonym_qry_fname, synonyn_index_fname):\n",
    "    kwords = sorted(kwd2cluster_sims.keys())\n",
    "    cluster_label = lambda lbl: \"cluster_\" + str(lbl)\n",
    "    \n",
    "    with open(synonym_qry_fname, \"w+\") as qry_f:\n",
    "        for kword in kwords:\n",
    "            cl_sims = kwd2cluster_sims[kword]\n",
    "            # unlike the other methods, we DO want to include the first cluster here\n",
    "            # as it's a cluster rather than the top 10 or top 30 keyword method\n",
    "            top_clusters = cl_sims[:topn]                \n",
    "            if len(top_clusters) > 0:\n",
    "                qry_f.write(\"%s=>\" % kword)\n",
    "                for lbl, sim in top_clusters:                    \n",
    "                    qry_f.write(\"%s|%f \" %(cluster_label(lbl),sim))\n",
    "                qry_f.write(\"\\n\")\n",
    "                \n",
    "    with open(synonyn_index_fname, \"w+\") as f:\n",
    "        for kword in kwords:\n",
    "            # get top cluster label\n",
    "            lbl, sim = kwd2cluster_sims[kword][0]\n",
    "            f.write(\"%s=>%s\\n\" % (kword, cluster_label(lbl)))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import gensim, time\n",
    "from gensim.models.word2vec import Word2Vec\n",
    "\n",
    "model = Word2Vec.load(MODEL_FILE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "24785"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "phrases = load_stop_words(PHRASES_FILE)\n",
    "len(phrases)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4709 keywords loaded from /Users/simon.hughes/Documents/Dice Data/LuceneTalk/top_5k_keywords.txt\n"
     ]
    }
   ],
   "source": [
    "keywords = []\n",
    "un_keywords = set()\n",
    "with open(KEY_WORDS_FILE) as f:\n",
    "    for line in f:\n",
    "        kw = line.strip()\n",
    "        if len(kw) > 0 and is_valid_search_keyword(kw):\n",
    "            keywords.append(kw)\n",
    "print(\"%i keywords loaded from %s\" % (len(keywords), KEY_WORDS_FILE))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 54,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(25189, 25189)"
      ]
     },
     "execution_count": 54,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#get all keywords\n",
    "# remove any not in the model\n",
    "all_kwds = phrases.union(keywords)\n",
    "#all_kwds = set(keywords)\n",
    "for kwd in list(all_kwds):\n",
    "    if kwd not in model.vocab:\n",
    "        all_kwds.remove(kwd)\n",
    "    splt = kwd.split(\" \")\n",
    "    # add in single word tokens from keywords\n",
    "    if splt and len(splt) > 1:\n",
    "        for wd in splt:\n",
    "            if wd.strip() and wd in model.vocab:\n",
    "                all_kwds.add(wd)\n",
    "\n",
    "id2kwd = dict()\n",
    "kwd2id = dict()\n",
    "vectors = []\n",
    "for term in all_kwds:\n",
    "    id2kwd[len(vectors)] = term\n",
    "    kwd2id[term] = len(vectors)\n",
    "    vec = get_norm_vector(term, model)\n",
    "    vectors.append(vec)\n",
    "\n",
    "len(all_kwds), len(vectors)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Clustering vectors into 3000 clusters\n",
      "Initialization complete\n",
      "Iteration  0, inertia 19544.329\n",
      "Iteration  1, inertia 12955.389\n",
      "Iteration  2, inertia 12826.108\n",
      "Iteration  3, inertia 12788.313\n",
      "Iteration  4, inertia 12771.860\n",
      "Iteration  5, inertia 12764.087\n",
      "Iteration  6, inertia 12759.219\n",
      "Iteration  7, inertia 12756.924\n",
      "Iteration  8, inertia 12755.063\n",
      "Iteration  9, inertia 12754.322\n",
      "Iteration 10, inertia 12753.633\n",
      "Iteration 11, inertia 12753.162\n",
      "Iteration 12, inertia 12753.076\n",
      "Iteration 13, inertia 12753.031\n",
      "Iteration 14, inertia 12752.941\n",
      "Iteration 15, inertia 12752.919\n",
      "Iteration 16, inertia 12752.891\n",
      "Converged at iteration 16\n",
      "Initialization complete\n",
      "Iteration  0, inertia 19562.936\n",
      "Iteration  1, inertia 12963.549\n",
      "Iteration  2, inertia 12835.096\n",
      "Iteration  3, inertia 12793.454\n",
      "Iteration  4, inertia 12775.182\n",
      "Iteration  5, inertia 12764.331\n",
      "Iteration  6, inertia 12757.906\n",
      "Iteration  7, inertia 12754.767\n",
      "Iteration  8, inertia 12753.236\n",
      "Iteration  9, inertia 12752.421\n",
      "Iteration 10, inertia 12752.234\n",
      "Iteration 11, inertia 12752.218\n",
      "Converged at iteration 11\n",
      "Initialization complete\n",
      "Iteration  0, inertia 19551.047\n",
      "Iteration  1, inertia 12962.796\n",
      "Iteration  2, inertia 12837.073\n",
      "Iteration  3, inertia 12795.039\n",
      "Iteration  4, inertia 12775.550\n",
      "Iteration  5, inertia 12766.712\n",
      "Iteration  6, inertia 12762.237\n",
      "Iteration  7, inertia 12760.335\n",
      "Iteration  8, inertia 12759.498\n",
      "Iteration  9, inertia 12759.030\n",
      "Iteration 10, inertia 12758.851\n",
      "Converged at iteration 10\n",
      "Initialization complete\n",
      "Iteration  0, inertia 19545.103\n",
      "Iteration  1, inertia 12943.239\n",
      "Iteration  2, inertia 12818.721\n",
      "Iteration  3, inertia 12778.609\n",
      "Iteration  4, inertia 12761.095\n",
      "Iteration  5, inertia 12753.191\n",
      "Iteration  6, inertia 12749.494\n",
      "Iteration  7, inertia 12747.253\n",
      "Iteration  8, inertia 12745.256\n",
      "Iteration  9, inertia 12743.906\n",
      "Iteration 10, inertia 12742.885\n",
      "Iteration 11, inertia 12742.287\n",
      "Iteration 12, inertia 12741.858\n",
      "Iteration 13, inertia 12741.410\n",
      "Iteration 14, inertia 12741.126\n",
      "Iteration 15, inertia 12741.004\n",
      "Iteration 16, inertia 12740.937\n",
      "Converged at iteration 16\n",
      "Initialization complete\n",
      "Iteration  0, inertia 19562.984\n",
      "Iteration  1, inertia 12958.604\n",
      "Iteration  2, inertia 12828.955\n",
      "Iteration  3, inertia 12785.170\n",
      "Iteration  4, inertia 12766.394\n",
      "Iteration  5, inertia 12759.080\n",
      "Iteration  6, inertia 12755.253\n",
      "Iteration  7, inertia 12752.681\n",
      "Iteration  8, inertia 12751.589\n",
      "Iteration  9, inertia 12751.017\n",
      "Iteration 10, inertia 12750.028\n",
      "Iteration 11, inertia 12749.291\n",
      "Iteration 12, inertia 12748.855\n",
      "Iteration 13, inertia 12748.299\n",
      "Iteration 14, inertia 12747.935\n",
      "Iteration 15, inertia 12747.405\n",
      "Iteration 16, inertia 12746.919\n",
      "Iteration 17, inertia 12746.475\n",
      "Iteration 18, inertia 12746.113\n",
      "Iteration 19, inertia 12745.953\n",
      "Converged at iteration 19\n"
     ]
    }
   ],
   "source": [
    "from sklearn import cluster\n",
    "from sklearn.cluster import KMeans\n",
    "import time\n",
    "start = time.time()\n",
    "\n",
    "# don't parallelize (n_jobs = -1), doesn't seem to work\n",
    "print(\"Clustering vectors into %i clusters\" % NUM_CLUSTERS)\n",
    "km_clusterer = KMeans(n_clusters=NUM_CLUSTERS, n_jobs=1, verbose=1, n_init=5)\n",
    "ids = km_clusterer.fit_predict(vectors)\n",
    "\n",
    "end = time.time()\n",
    "print(\"Creating %i clusters took %i seconds\" % (NUM_CLUSTERS, end - start))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 53,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "(3000, 3000)"
      ]
     },
     "execution_count": 53,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lbl2cluster = extract_clusters(ids, id2kwd)\n",
    "lbl2centroid = extract_centroids(km_clusterer)\n",
    "\n",
    "len(lbl2cluster), len(lbl2centroid)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1000 computed out of 25189\n",
      "2000 computed out of 25189\n",
      "3000 computed out of 25189\n",
      "4000 computed out of 25189\n",
      "5000 computed out of 25189\n",
      "6000 computed out of 25189\n",
      "7000 computed out of 25189\n",
      "8000 computed out of 25189\n",
      "9000 computed out of 25189\n",
      "10000 computed out of 25189\n",
      "11000 computed out of 25189\n",
      "12000 computed out of 25189\n",
      "13000 computed out of 25189\n",
      "14000 computed out of 25189\n",
      "15000 computed out of 25189\n",
      "16000 computed out of 25189\n",
      "17000 computed out of 25189\n",
      "18000 computed out of 25189\n",
      "19000 computed out of 25189\n",
      "20000 computed out of 25189\n",
      "21000 computed out of 25189\n",
      "22000 computed out of 25189\n",
      "23000 computed out of 25189\n",
      "24000 computed out of 25189\n",
      "25000 computed out of 25189\n",
      "Sorting the clusters for each of the 25189 keywords took 485 seconds\n"
     ]
    }
   ],
   "source": [
    "import time\n",
    "start = time.time()\n",
    "\n",
    "kwd2cluster_sims = compute_cluster_similarities(all_kwds, kwd2id, vectors, lbl2centroid)\n",
    "end = time.time()\n",
    "print(\"Sorting the clusters for each of the %i keywords took %i seconds\" % (len(all_kwds),end - start))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 101,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "write_most_similar_clusters(NUM_CLUSTER_SYNONYMS, kwd2cluster_sims, SYNONYMS_QRY_FILE, SYNONYMS_INDEX_FILE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "grand_end = time.time()\n",
    "print(\"Cluster generation and processing took %i seconds\" % (grand_end - grand_start))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Examine the Clusters"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 102,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[{'switches'},\n",
       " {'coding testing',\n",
       "  'complete software',\n",
       "  'entire software',\n",
       "  'lifecycle requirements',\n",
       "  'requirements gathering design'},\n",
       " {'build solutions',\n",
       "  'code base',\n",
       "  'code bases',\n",
       "  'codebase',\n",
       "  'define best',\n",
       "  'deliver products',\n",
       "  'existing code',\n",
       "  'existing features',\n",
       "  'existing ones',\n",
       "  'existing product',\n",
       "  'existing production',\n",
       "  'existing services',\n",
       "  'feature',\n",
       "  'feature set',\n",
       "  'fine tune',\n",
       "  'flawless',\n",
       "  'implement enhancements',\n",
       "  'implement product',\n",
       "  'legacy code',\n",
       "  'migration activities',\n",
       "  'necessary changes',\n",
       "  'product architecture',\n",
       "  'quality applications',\n",
       "  'refactor',\n",
       "  'refactoring',\n",
       "  'reuse',\n",
       "  'rewrite',\n",
       "  'usefulness'},\n",
       " {'application hosting',\n",
       "  'application web',\n",
       "  'database web',\n",
       "  'databases web',\n",
       "  'enterprise technologies',\n",
       "  'server setup',\n",
       "  'technologies used'},\n",
       " {'bi business',\n",
       "  'bi stack',\n",
       "  'mdx',\n",
       "  'olap',\n",
       "  'sql bi',\n",
       "  'ssas',\n",
       "  'ssis',\n",
       "  'ssis developer',\n",
       "  'ssis ssas',\n",
       "  'ssrs ssas'},\n",
       " {'bi lingual',\n",
       "  'bilingual',\n",
       "  'chinese',\n",
       "  'fluent',\n",
       "  'french',\n",
       "  'german',\n",
       "  'japanese',\n",
       "  'korean',\n",
       "  'lingual',\n",
       "  'localized',\n",
       "  'portuguese',\n",
       "  'russian',\n",
       "  'spanish',\n",
       "  'speak',\n",
       "  'speaker'},\n",
       " {'connectivity',\n",
       "  'firewalls routers switches',\n",
       "  'firewalls switches',\n",
       "  'infrastructure environment',\n",
       "  'load balancers firewalls',\n",
       "  'mobile computing',\n",
       "  'network components',\n",
       "  'network devices',\n",
       "  'network hardware',\n",
       "  'network operating systems',\n",
       "  'network servers',\n",
       "  'networking equipment',\n",
       "  'networks servers',\n",
       "  'operating systems applications',\n",
       "  'operating systems network',\n",
       "  'platforms operating systems',\n",
       "  'routers switches firewalls',\n",
       "  'security hardware',\n",
       "  'servers network',\n",
       "  'servers networking',\n",
       "  'servers storage',\n",
       "  'switches routers',\n",
       "  'switches routers firewalls',\n",
       "  'systems servers'},\n",
       " {'detail design', 'requirements review'},\n",
       " {'company specializing',\n",
       "  'consulting firm',\n",
       "  'consulting organization',\n",
       "  'encouraged',\n",
       "  'genesis10',\n",
       "  'matlen silver',\n",
       "  'moment',\n",
       "  'prefered',\n",
       "  'providing services',\n",
       "  'pwc',\n",
       "  'sponsored',\n",
       "  'sponsoring',\n",
       "  'sponsorships'},\n",
       " {'narrative'},\n",
       " {'project progress'},\n",
       " {'attention',\n",
       "  'attentive',\n",
       "  'close attention',\n",
       "  'compromising',\n",
       "  'conscientious',\n",
       "  'conscious',\n",
       "  'customer oriented',\n",
       "  'customer service',\n",
       "  'customer service communication',\n",
       "  'customer service focus',\n",
       "  'customer service oriented',\n",
       "  'deliver results',\n",
       "  'delivering results',\n",
       "  'demonstrated commitment',\n",
       "  'dependability',\n",
       "  'dependable',\n",
       "  'detailed oriented',\n",
       "  'diligence',\n",
       "  'diligent',\n",
       "  'do attitude',\n",
       "  'ethic',\n",
       "  'excellent follow',\n",
       "  'extremely detail oriented',\n",
       "  'good attention',\n",
       "  'great customer service',\n",
       "  'habits',\n",
       "  'impeccable',\n",
       "  'meticulous',\n",
       "  'meticulous attention',\n",
       "  'organized',\n",
       "  'orientated',\n",
       "  'outgoing',\n",
       "  'outstanding customer service',\n",
       "  'pay attention',\n",
       "  'personality',\n",
       "  'pleasant',\n",
       "  'positive attitude',\n",
       "  'producing quality',\n",
       "  'professional appearance',\n",
       "  'professional attitude',\n",
       "  'professional demeanor',\n",
       "  'punctual',\n",
       "  'punctuality',\n",
       "  'quality customer service',\n",
       "  'quality results',\n",
       "  'quantity',\n",
       "  'sacrificing',\n",
       "  'self motivated detail oriented',\n",
       "  'self motivation',\n",
       "  'superb',\n",
       "  'superior',\n",
       "  'thoroughness',\n",
       "  'traits'},\n",
       " {'process definition',\n",
       "  'process owner',\n",
       "  'process re engineering',\n",
       "  'quality tools',\n",
       "  're engineer'},\n",
       " {'arrangements',\n",
       "  'assistance program',\n",
       "  'cafeteria',\n",
       "  'child',\n",
       "  'club',\n",
       "  'commuter',\n",
       "  'credit union',\n",
       "  'discount',\n",
       "  'discounted',\n",
       "  'discounts',\n",
       "  'eap',\n",
       "  'employee assistance',\n",
       "  'employee assistance program',\n",
       "  'employee stock purchase',\n",
       "  'fitness center',\n",
       "  'flex',\n",
       "  'gym membership',\n",
       "  'parking',\n",
       "  'passes',\n",
       "  'reimbursement',\n",
       "  'subsidized',\n",
       "  'supplement',\n",
       "  'tuition assistance',\n",
       "  'vision plans',\n",
       "  'wellness'},\n",
       " {'cocoa', 'swift'},\n",
       " {'create prototypes', 'explore', 'illustrate'},\n",
       " {'document system',\n",
       "  'drive solutions',\n",
       "  'implement processes',\n",
       "  'system modifications'},\n",
       " {'enterprise performance', 'hhs'},\n",
       " {'accustomed',\n",
       "  'adapt',\n",
       "  'adapt quickly',\n",
       "  'adaptability',\n",
       "  'adaptable',\n",
       "  'adapting',\n",
       "  'aggressive',\n",
       "  'ambiguous',\n",
       "  'broad base',\n",
       "  'busy',\n",
       "  'challenging environment',\n",
       "  'changing business',\n",
       "  'changing environment',\n",
       "  'changing priorities',\n",
       "  'changing requirements',\n",
       "  'comfortable working',\n",
       "  'comfortably',\n",
       "  'constantly changing',\n",
       "  'demands',\n",
       "  'dynamic',\n",
       "  'dynamic business',\n",
       "  'dynamic environment',\n",
       "  'dynamic fast',\n",
       "  'dynamic fast paced',\n",
       "  'embrace change',\n",
       "  'entrepreneurial environment',\n",
       "  'environment managing',\n",
       "  'ever changing',\n",
       "  'extremely fast',\n",
       "  'fast changing',\n",
       "  'fast pace',\n",
       "  'fast pace environment',\n",
       "  'fast paced',\n",
       "  'fast paced agile',\n",
       "  'fast paced agile environment',\n",
       "  'fast paced business',\n",
       "  'fast paced changing',\n",
       "  'fast paced deadline',\n",
       "  'fast paced deadline driven',\n",
       "  'fast paced demanding',\n",
       "  'fast paced dynamic',\n",
       "  'fast paced dynamic environment',\n",
       "  'fast paced environment',\n",
       "  'fast paced ever',\n",
       "  'fast paced highly',\n",
       "  'fast paced results',\n",
       "  'flexibility',\n",
       "  'flexible',\n",
       "  'focused environment',\n",
       "  'function effectively',\n",
       "  'gears',\n",
       "  'highly dynamic',\n",
       "  'interrupt',\n",
       "  'operate effectively',\n",
       "  'operate independently',\n",
       "  'paced',\n",
       "  'produce results',\n",
       "  'productively',\n",
       "  'project environment',\n",
       "  'quick turnaround',\n",
       "  'quickly adapt',\n",
       "  'respond quickly',\n",
       "  'self motivated team player',\n",
       "  'shifting',\n",
       "  'shifting priorities',\n",
       "  'somewhat',\n",
       "  'technical environment',\n",
       "  'thrive',\n",
       "  'tight',\n",
       "  'tight timelines',\n",
       "  'turnaround',\n",
       "  'whilst'},\n",
       " {'advanced system',\n",
       "  'computer operating systems',\n",
       "  'desktop environment',\n",
       "  'desktop operating',\n",
       "  'enterprise management',\n",
       "  'hardware',\n",
       "  'hardware software',\n",
       "  'hardware systems',\n",
       "  'installation configuration maintenance',\n",
       "  'installation configuration operation',\n",
       "  'lifecycle management',\n",
       "  'network operating',\n",
       "  'operating system software',\n",
       "  'operating systems hardware',\n",
       "  'operating systems software',\n",
       "  'oss',\n",
       "  'server desktop',\n",
       "  'server system',\n",
       "  'software operating',\n",
       "  'software operating systems',\n",
       "  'system environment',\n",
       "  'system hardware',\n",
       "  'system infrastructure',\n",
       "  'system platforms',\n",
       "  'system software',\n",
       "  'systems operations',\n",
       "  'troubleshooting software',\n",
       "  'troubleshooting support',\n",
       "  'windows server operating',\n",
       "  'windows support'},\n",
       " {'capacity',\n",
       "  'computer equipment',\n",
       "  'devises',\n",
       "  'general direction',\n",
       "  'limitations',\n",
       "  'limitations operating',\n",
       "  'modifies procedures',\n",
       "  'solve complex',\n",
       "  'solve complex business',\n",
       "  'solve complex problems',\n",
       "  'system capacity',\n",
       "  'system s'},\n",
       " {'4.x', '5.x', 'esxi', 'vmware esxi'},\n",
       " {'age',\n",
       "  'age color',\n",
       "  'age disability veteran status',\n",
       "  'basis',\n",
       "  'category',\n",
       "  'color national origin',\n",
       "  'disability age',\n",
       "  'disability protected veteran status',\n",
       "  'disability veteran status',\n",
       "  'discriminate',\n",
       "  'ethnicity',\n",
       "  'expression',\n",
       "  'genetic',\n",
       "  'genetic information',\n",
       "  'genetics',\n",
       "  'identity disability',\n",
       "  'information marital status',\n",
       "  'legally',\n",
       "  'legally protected',\n",
       "  'marital',\n",
       "  'marital status',\n",
       "  'military',\n",
       "  'military service',\n",
       "  'national origin',\n",
       "  'national origin ancestry',\n",
       "  'national origin disability',\n",
       "  'perceived',\n",
       "  'prokarma',\n",
       "  'protected',\n",
       "  'race',\n",
       "  'status age',\n",
       "  'unrelated',\n",
       "  'veteran status',\n",
       "  'veteran status disability'},\n",
       " {'.net asp',\n",
       "  'asp',\n",
       "  'c# java',\n",
       "  'c# vb',\n",
       "  'cold fusion',\n",
       "  'coldfusion',\n",
       "  'software analysis',\n",
       "  'vb',\n",
       "  'visual basic'},\n",
       " {'application development',\n",
       "  'client server application',\n",
       "  'coding debugging',\n",
       "  'complete sdlc',\n",
       "  'complex software systems',\n",
       "  'cross platform',\n",
       "  'different software',\n",
       "  'end software',\n",
       "  'enterprise class software',\n",
       "  'managing software',\n",
       "  'overall software',\n",
       "  'performing application',\n",
       "  'production software',\n",
       "  'professional software',\n",
       "  'qa software testing',\n",
       "  'requirements design implementation',\n",
       "  'software development',\n",
       "  'software integration',\n",
       "  'software life cycle',\n",
       "  'software quality assurance',\n",
       "  'software testing',\n",
       "  'systems environments',\n",
       "  'systems level',\n",
       "  'test qa',\n",
       "  'testing programs',\n",
       "  'years+'},\n",
       " {'analysis testing',\n",
       "  'automated solutions',\n",
       "  'coordinate system',\n",
       "  'coordinate test',\n",
       "  'design test plans',\n",
       "  'end performance',\n",
       "  'execute software',\n",
       "  'execution',\n",
       "  'newly developed',\n",
       "  'perform test',\n",
       "  'requirements testing',\n",
       "  'scenarios',\n",
       "  'support test',\n",
       "  'support testing',\n",
       "  'support user',\n",
       "  'system integration',\n",
       "  'system test',\n",
       "  'testing activities'},\n",
       " {'dhtml', 'xml html'},\n",
       " {'batch processes',\n",
       "  'build data',\n",
       "  'cdc',\n",
       "  'change data capture',\n",
       "  'dac',\n",
       "  'event processing',\n",
       "  'optim',\n",
       "  'oracle warehouse builder',\n",
       "  'partition',\n",
       "  'power center',\n",
       "  'quantities'},\n",
       " {'elite', 'norcross', 'pega architect'},\n",
       " {'html5 css3 jquery'},\n",
       " {'alike',\n",
       "  'alliances',\n",
       "  'appropriate communication',\n",
       "  'better understanding',\n",
       "  'build rapport',\n",
       "  'build relationships',\n",
       "  'building relationships',\n",
       "  'builds relationships',\n",
       "  'business contacts',\n",
       "  'business relationships',\n",
       "  'client organization',\n",
       "  'client relationships',\n",
       "  'clients vendors',\n",
       "  'collaborative relationships',\n",
       "  'collaborative working',\n",
       "  'communication collaboration',\n",
       "  'company management',\n",
       "  'constituent',\n",
       "  'constituents',\n",
       "  'cooperation',\n",
       "  'cooperative',\n",
       "  'cooperative working',\n",
       "  'credibility',\n",
       "  'cultivate',\n",
       "  'cultivates',\n",
       "  'cultivating',\n",
       "  'customer relationships',\n",
       "  'cxo',\n",
       "  'develop relationships',\n",
       "  'director vp',\n",
       "  'effective relationships',\n",
       "  'effective working',\n",
       "  'effective working relationships',\n",
       "  'effectively build',\n",
       "  'establish credibility',\n",
       "  'establish relationships',\n",
       "  'excellent people',\n",
       "  'good working relationships',\n",
       "  'influencers',\n",
       "  'information exchange',\n",
       "  'internal team members',\n",
       "  'interpersonal relationship',\n",
       "  'maintain effective',\n",
       "  'maintain effective working relationships',\n",
       "  'maintain excellent',\n",
       "  'maintain good',\n",
       "  'maintain positive',\n",
       "  'maintaining effective',\n",
       "  'maintains effective',\n",
       "  'maintains relationships',\n",
       "  'outstanding customer',\n",
       "  'positive customer',\n",
       "  'positive relationships',\n",
       "  'positive working',\n",
       "  'productive',\n",
       "  'productive relationships',\n",
       "  'productive working',\n",
       "  'professional business',\n",
       "  'professional communication',\n",
       "  'professional relationships',\n",
       "  'professional working',\n",
       "  'rapport',\n",
       "  'relations',\n",
       "  'relationships',\n",
       "  'relationships throughout',\n",
       "  'respectful',\n",
       "  'trusting',\n",
       "  'win win',\n",
       "  'working relationships'},\n",
       " {'good documentation', 'microsoft excel word', 'solid working'},\n",
       " {'cordova', 'end technologies', 'js jquery'},\n",
       " {'account executives',\n",
       "  'account managers',\n",
       "  'account planning',\n",
       "  'account teams',\n",
       "  'alliance',\n",
       "  'channel',\n",
       "  'channel partners',\n",
       "  'client partner',\n",
       "  'develop customer',\n",
       "  'existing clients',\n",
       "  'field sales',\n",
       "  'integrators',\n",
       "  'internal sales',\n",
       "  'market share',\n",
       "  'partner relationships',\n",
       "  'pre sales support',\n",
       "  'pre sales technical',\n",
       "  'presales',\n",
       "  'product sales',\n",
       "  'rep',\n",
       "  'reps',\n",
       "  'reseller',\n",
       "  'resellers',\n",
       "  'sales',\n",
       "  'sales account',\n",
       "  'sales director',\n",
       "  'sales management',\n",
       "  'sales managers',\n",
       "  'sales process',\n",
       "  'sales product',\n",
       "  'sales representative',\n",
       "  'sales representatives',\n",
       "  'sales reps',\n",
       "  'sales sales',\n",
       "  'sales strategy',\n",
       "  'sales team',\n",
       "  'sales teams',\n",
       "  'sales technical',\n",
       "  'se',\n",
       "  'services sales',\n",
       "  'solutions architects',\n",
       "  'strategic partners',\n",
       "  'technical account',\n",
       "  'technology sales',\n",
       "  'territory'},\n",
       " {'attend',\n",
       "  'attend meetings',\n",
       "  'attend project',\n",
       "  'attends',\n",
       "  'chair',\n",
       "  'conference calls',\n",
       "  'discussion',\n",
       "  'kick',\n",
       "  'kickoff',\n",
       "  'management meetings',\n",
       "  'meeting',\n",
       "  'meetings',\n",
       "  'project meetings',\n",
       "  'regularly scheduled',\n",
       "  'review meetings'},\n",
       " {'advising',\n",
       "  'champions',\n",
       "  'directors',\n",
       "  'leadership direction',\n",
       "  'lower level',\n",
       "  'mentor',\n",
       "  'mentor train',\n",
       "  'mentorship',\n",
       "  'principals',\n",
       "  'proper use',\n",
       "  'provides mentoring',\n",
       "  'providing technical guidance',\n",
       "  'sharing knowledge',\n",
       "  'training staff',\n",
       "  'transfer knowledge'},\n",
       " {'creating software',\n",
       "  'go',\n",
       "  'languages python',\n",
       "  'proven hands',\n",
       "  'ruby java',\n",
       "  'ruby python'},\n",
       " {'collected', 'evaluate data', 'relevance', 'validate data'},\n",
       " {'avro', 'hdfs', 'kafka', 'serialization', 'structured data', 'thrift'},\n",
       " {'basic scripting',\n",
       "  'batch files',\n",
       "  'dos',\n",
       "  'installshield',\n",
       "  'nt',\n",
       "  'power shell',\n",
       "  'powershell',\n",
       "  'powershell perl',\n",
       "  'powershell scripting',\n",
       "  'powershell vbscript',\n",
       "  'scripting powershell',\n",
       "  'scripting technologies',\n",
       "  'vb script',\n",
       "  'vb scripting',\n",
       "  'vbscript',\n",
       "  'windows powershell',\n",
       "  'windows scripting',\n",
       "  'wmi'},\n",
       " {'advanced networking',\n",
       "  'basic network',\n",
       "  'basic networking',\n",
       "  'distributed computing environment',\n",
       "  'firewalls load',\n",
       "  'firewalls load balancers',\n",
       "  'firewalls routers',\n",
       "  'general networking',\n",
       "  'infrastructure knowledge',\n",
       "  'ip networking',\n",
       "  'network technologies',\n",
       "  'network topologies',\n",
       "  'networking',\n",
       "  'networking concepts',\n",
       "  'networking principles',\n",
       "  'networking tcp ip',\n",
       "  'networking technologies',\n",
       "  'osi model',\n",
       "  'routers hubs',\n",
       "  'routing firewalls',\n",
       "  'security fundamentals',\n",
       "  'security networking',\n",
       "  'tcp ip dns dhcp',\n",
       "  'tcp ip network',\n",
       "  'tcp ip networking',\n",
       "  'tcp ip protocol',\n",
       "  'tcp ip routing'},\n",
       " {'coordinated',\n",
       "  'ensure testing',\n",
       "  'held accountable',\n",
       "  'production support teams',\n",
       "  'program team',\n",
       "  'project testing',\n",
       "  'qa manager',\n",
       "  'qa resources',\n",
       "  'qa team',\n",
       "  'team leader',\n",
       "  'test efforts',\n",
       "  'test teams',\n",
       "  'testing efforts',\n",
       "  'testing team',\n",
       "  'tracking issues'},\n",
       " {'adobe',\n",
       "  'adobe analytics',\n",
       "  'create custom',\n",
       "  'sitecatalyst',\n",
       "  'web analytics'},\n",
       " {'dm',\n",
       "  'material management',\n",
       "  'mrp',\n",
       "  'plant maintenance',\n",
       "  'production planning'},\n",
       " {'continual service improvement',\n",
       "  'incident tickets',\n",
       "  'managing it',\n",
       "  'request fulfillment',\n",
       "  'service management itsm'},\n",
       " {'administering windows',\n",
       "  'basic windows',\n",
       "  'client operating',\n",
       "  'current versions',\n",
       "  'desktop operating systems',\n",
       "  'filemaker',\n",
       "  'intermediate windows',\n",
       "  'mac',\n",
       "  'mac operating',\n",
       "  'mac os',\n",
       "  'mac os x',\n",
       "  'mac osx',\n",
       "  'macintosh',\n",
       "  'maintaining windows',\n",
       "  'managing windows',\n",
       "  'microsoft desktop',\n",
       "  'microsoft operating',\n",
       "  'microsoft operating systems',\n",
       "  'microsoft os',\n",
       "  'microsoft windows',\n",
       "  'microsoft windows desktop',\n",
       "  'microsoft windows operating',\n",
       "  'ms server',\n",
       "  'ms windows',\n",
       "  'operating systems microsoft',\n",
       "  'os windows',\n",
       "  'os x',\n",
       "  'osx',\n",
       "  'pc operating',\n",
       "  'platforms windows',\n",
       "  'server linux',\n",
       "  'server operating system',\n",
       "  'server operating systems',\n",
       "  'server windows',\n",
       "  'supporting windows',\n",
       "  'technologies windows',\n",
       "  'troubleshooting microsoft',\n",
       "  'troubleshooting windows',\n",
       "  'win7',\n",
       "  'windows',\n",
       "  'windows 7',\n",
       "  'windows desktop',\n",
       "  'windows desktop operating',\n",
       "  'windows environment',\n",
       "  'windows mac',\n",
       "  'windows microsoft',\n",
       "  'windows network',\n",
       "  'windows nt',\n",
       "  'windows operating system',\n",
       "  'windows operating systems',\n",
       "  'windows platforms',\n",
       "  'windows system administration',\n",
       "  'windows xp',\n",
       "  'windows xp windows',\n",
       "  'xp windows'},\n",
       " {'acceleration',\n",
       "  'ace',\n",
       "  'acs',\n",
       "  'appliances',\n",
       "  'balancer',\n",
       "  'balancers',\n",
       "  'big ip',\n",
       "  'bigip',\n",
       "  'blue coat',\n",
       "  'citrix netscaler',\n",
       "  'f5',\n",
       "  'f5 big ip',\n",
       "  'f5 load',\n",
       "  'f5 load balancing',\n",
       "  'f5 ltm',\n",
       "  'gateway',\n",
       "  'gtm',\n",
       "  'health monitoring',\n",
       "  'imperva',\n",
       "  'infoblox',\n",
       "  'load balancer',\n",
       "  'load balancers',\n",
       "  'load balancing',\n",
       "  'ltm',\n",
       "  'ltm gtm',\n",
       "  'network firewalls',\n",
       "  'network load',\n",
       "  'prior hands',\n",
       "  'proxies',\n",
       "  'proxy',\n",
       "  'proxy servers',\n",
       "  'reverse',\n",
       "  'server load',\n",
       "  'server load balancing',\n",
       "  'ssl certificates',\n",
       "  'ssl vpn',\n",
       "  'tacacs',\n",
       "  'vip',\n",
       "  'waf',\n",
       "  'web application firewall',\n",
       "  'websense'},\n",
       " {'client projects',\n",
       "  'company working',\n",
       "  'consumer product',\n",
       "  'development manager',\n",
       "  'mobile product',\n",
       "  'played'},\n",
       " {'center security',\n",
       "  'deployment strategies',\n",
       "  'designs solutions',\n",
       "  'directional',\n",
       "  'lead efforts',\n",
       "  'mscs',\n",
       "  'network equipment'},\n",
       " {'basking ridge', 'eden', 'franklin', 'managment', 'resides'},\n",
       " {'articulate',\n",
       "  'complex business',\n",
       "  'it team',\n",
       "  'technical terms',\n",
       "  'translate business requirements',\n",
       "  'translate functional requirements',\n",
       "  'users it'},\n",
       " {'iso iec', 'soc', 'ssae', 'ssae16'},\n",
       " {'production support team'},\n",
       " {'advanced degrees',\n",
       "  'b.s',\n",
       "  'bachelor',\n",
       "  'bachelors',\n",
       "  'bs',\n",
       "  'bs computer science',\n",
       "  'bs cs',\n",
       "  'bs degree',\n",
       "  'bs ms',\n",
       "  'bs ms degree',\n",
       "  'computer engineering',\n",
       "  'computer engineering electrical engineering',\n",
       "  'computer science engineering',\n",
       "  'cs',\n",
       "  'cs ee',\n",
       "  'degree',\n",
       "  'directly relevant',\n",
       "  'educational requirements',\n",
       "  'ee',\n",
       "  'ee cs',\n",
       "  'electrical engineering computer science',\n",
       "  'engineer level',\n",
       "  'engineering computer science',\n",
       "  'engineering degree',\n",
       "  'equivalent knowledge',\n",
       "  'ged',\n",
       "  'higher degree',\n",
       "  'hs',\n",
       "  'hs diploma',\n",
       "  'it computer science',\n",
       "  'lieu',\n",
       "  'master degree',\n",
       "  'masters',\n",
       "  'masters degree',\n",
       "  'ms',\n",
       "  'ms degree',\n",
       "  'phd',\n",
       "  'progressive it',\n",
       "  'science computer science',\n",
       "  'science degree',\n",
       "  'similar technical',\n",
       "  'substituted',\n",
       "  'technical degree',\n",
       "  'technical field',\n",
       "  'typical minimum education'},\n",
       " {'archives',\n",
       "  'company web',\n",
       "  'content',\n",
       "  'content management system cms',\n",
       "  'digital assets',\n",
       "  'sharepoint sites',\n",
       "  'web platform',\n",
       "  'web presence',\n",
       "  'web site'},\n",
       " {'access powerpoint',\n",
       "  'applications excel',\n",
       "  'excel access powerpoint',\n",
       "  'excellent computer',\n",
       "  'literate',\n",
       "  'microsoft office applications',\n",
       "  'microsoft office software',\n",
       "  'microsoft word',\n",
       "  'microsoft word excel powerpoint',\n",
       "  'ms office applications',\n",
       "  'ms word excel',\n",
       "  'outlook excel',\n",
       "  'outlook word',\n",
       "  'powerpoint outlook',\n",
       "  'powerpoint project',\n",
       "  'use microsoft',\n",
       "  'word excel',\n",
       "  'word excel access',\n",
       "  'word excel powerpoint access',\n",
       "  'word outlook'},\n",
       " {'climate',\n",
       "  'frequent',\n",
       "  'indoor',\n",
       "  'normally',\n",
       "  'office environment',\n",
       "  'office setting',\n",
       "  'standard office'},\n",
       " {'area',\n",
       "  'chester',\n",
       "  'corporate headquarters',\n",
       "  'corporate office',\n",
       "  'grand',\n",
       "  'metro area',\n",
       "  'mill',\n",
       "  'mills',\n",
       "  'san',\n",
       "  'san mateo',\n",
       "  'suburban',\n",
       "  'toronto',\n",
       "  'twin'},\n",
       " {'active current',\n",
       "  'active dod',\n",
       "  'active secret',\n",
       "  'active secret clearance',\n",
       "  'clearance northrop grumman',\n",
       "  'cleared',\n",
       "  'current active',\n",
       "  'current dod',\n",
       "  'current secret',\n",
       "  'given preferential consideration',\n",
       "  'interim secret',\n",
       "  'sar',\n",
       "  'sci',\n",
       "  'sci clearance',\n",
       "  'secret',\n",
       "  'secret sci',\n",
       "  'ssbi',\n",
       "  'ts',\n",
       "  'ts sci w'},\n",
       " {'code design', 'code written', 'procs', 'sequences', 'troubleshoot sql'},\n",
       " {'adherence',\n",
       "  'appropriateness',\n",
       "  'approving',\n",
       "  'assurance activities',\n",
       "  'assurance reviews',\n",
       "  'assurance team',\n",
       "  'audits',\n",
       "  'checklist',\n",
       "  'checklists',\n",
       "  'conduct internal',\n",
       "  'conduct quality',\n",
       "  'contract requirements',\n",
       "  'control documentation',\n",
       "  'customer specifications',\n",
       "  'defined processes',\n",
       "  'documented',\n",
       "  'ensure adherence',\n",
       "  'ensure compliance',\n",
       "  'impact assessments',\n",
       "  'management function',\n",
       "  'management procedures',\n",
       "  'management reviews',\n",
       "  'overall quality',\n",
       "  'perform quality',\n",
       "  'performing quality',\n",
       "  'performs quality',\n",
       "  'periodic reviews',\n",
       "  'preliminary',\n",
       "  'project process',\n",
       "  'project reviews',\n",
       "  'quality checks',\n",
       "  'quality control',\n",
       "  'quality requirements',\n",
       "  'quality review',\n",
       "  'quality reviews',\n",
       "  'readiness reviews',\n",
       "  'review',\n",
       "  'review approval',\n",
       "  'review documentation',\n",
       "  'review processes',\n",
       "  'review technical',\n",
       "  'reviews',\n",
       "  'systems designs',\n",
       "  'technical accuracy',\n",
       "  'technical review',\n",
       "  'testing procedures',\n",
       "  'walk throughs'},\n",
       " {'built relationships',\n",
       "  'contacted',\n",
       "  'dynamics ax',\n",
       "  'dynamics crm',\n",
       "  'dynamics gp',\n",
       "  'dynamics nav',\n",
       "  'employers',\n",
       "  'i',\n",
       "  'market i',\n",
       "  'microsoft dynamics market',\n",
       "  'placing',\n",
       "  'specializing solely',\n",
       "  'unrivaled'},\n",
       " {'accomplish goals',\n",
       "  'accomplish tasks',\n",
       "  'communicate findings',\n",
       "  'complex situations',\n",
       "  'decision making process',\n",
       "  'decisions',\n",
       "  'decisive',\n",
       "  'discretion',\n",
       "  'excellent judgment',\n",
       "  'exercise good',\n",
       "  'exercise independent judgment',\n",
       "  'exercise sound',\n",
       "  'good judgment',\n",
       "  'judgement',\n",
       "  'judgment',\n",
       "  'judgments',\n",
       "  'problem solving troubleshooting',\n",
       "  'relying',\n",
       "  'situational',\n",
       "  'solves problems',\n",
       "  'sound decisions',\n",
       "  'sound judgment'},\n",
       " {'create process',\n",
       "  'data process',\n",
       "  'develop process',\n",
       "  'document process',\n",
       "  'gained',\n",
       "  'maps',\n",
       "  'procedural',\n",
       "  'process mapping',\n",
       "  'process maps',\n",
       "  'use knowledge'},\n",
       " {'agile testing',\n",
       "  'atdd',\n",
       "  'bdd',\n",
       "  'behavior driven',\n",
       "  'cucumber',\n",
       "  'gherkin',\n",
       "  'pair',\n",
       "  'tdd',\n",
       "  'tdd bdd',\n",
       "  'tdd test driven',\n",
       "  'test driven'},\n",
       " {'day maintenance', 'day operation', 'monitoring maintenance'},\n",
       " {'c c++ programming',\n",
       "  'device driver',\n",
       "  'device drivers',\n",
       "  'embedded c',\n",
       "  'embedded linux',\n",
       "  'embedded real',\n",
       "  'embedded software',\n",
       "  'embedded systems',\n",
       "  'qnx',\n",
       "  'rtos',\n",
       "  'vxworks',\n",
       "  'wind'},\n",
       " {'advanced software',\n",
       "  'flight test',\n",
       "  'missile',\n",
       "  'operationally',\n",
       "  'operator',\n",
       "  'payload',\n",
       "  'telemetry',\n",
       "  'weapon',\n",
       "  'weapons'},\n",
       " {'11i', 'oracle r12', 'r12'},\n",
       " {'build consensus',\n",
       "  'business groups',\n",
       "  'business it',\n",
       "  'business partners',\n",
       "  'business sponsors',\n",
       "  'business stakeholders',\n",
       "  'business technical',\n",
       "  'client stakeholders',\n",
       "  'cross functional business',\n",
       "  'cross functional groups',\n",
       "  'cross functional teams',\n",
       "  'facilitate discussions',\n",
       "  'influence',\n",
       "  'internal stakeholders',\n",
       "  'internal teams',\n",
       "  'it business',\n",
       "  'it teams',\n",
       "  'leaders',\n",
       "  'organization',\n",
       "  'partners',\n",
       "  'sponsors',\n",
       "  'stakeholders',\n",
       "  'teams',\n",
       "  'technology stakeholders'},\n",
       " {'branching',\n",
       "  'branching merging',\n",
       "  'build deploy',\n",
       "  'build process',\n",
       "  'build release',\n",
       "  'code deployment',\n",
       "  'code management',\n",
       "  'code repository',\n",
       "  'cvs',\n",
       "  'deployment processes',\n",
       "  'environment configuration',\n",
       "  'management build',\n",
       "  'practices tools',\n",
       "  'repositories',\n",
       "  'revision control',\n",
       "  'source code',\n",
       "  'source code control',\n",
       "  'source code management',\n",
       "  'source control',\n",
       "  'support tools',\n",
       "  'version control',\n",
       "  'versioning'},\n",
       " {'daily weekly monthly',\n",
       "  'data accuracy',\n",
       "  'ensure accurate',\n",
       "  'monitor',\n",
       "  'monitor analyze',\n",
       "  'monitor daily',\n",
       "  'nightly',\n",
       "  'perform daily',\n",
       "  'perform general',\n",
       "  'perform regular',\n",
       "  'system setup'},\n",
       " {'actionable',\n",
       "  'complicated',\n",
       "  'consumable',\n",
       "  'crisp',\n",
       "  'derive',\n",
       "  'distill',\n",
       "  'research findings',\n",
       "  'translate complex',\n",
       "  'translate those'},\n",
       " {'business consultant',\n",
       "  'enterprise risk management',\n",
       "  'information security risk management',\n",
       "  'internal audit',\n",
       "  'it audit',\n",
       "  'it auditor',\n",
       "  'it compliance',\n",
       "  'it internal',\n",
       "  'it risk',\n",
       "  'it risk management',\n",
       "  'management industry',\n",
       "  'public accounting',\n",
       "  'risk analyst',\n",
       "  'technology risk management'},\n",
       " {'definitely',\n",
       "  'direct contact',\n",
       "  'emails',\n",
       "  'frank',\n",
       "  'mailing',\n",
       "  'nigel',\n",
       "  'receiving',\n",
       "  'ref',\n",
       "  'removed',\n",
       "  'reply'},\n",
       " {'cloud computing',\n",
       "  'cutting edge technologies',\n",
       "  'data centers',\n",
       "  'deliver highly',\n",
       "  'delivering highly',\n",
       "  'devops team',\n",
       "  'different technology',\n",
       "  'dynamically',\n",
       "  'edge cloud',\n",
       "  'efficient delivery',\n",
       "  'integrated systems',\n",
       "  'scale',\n",
       "  'scaling',\n",
       "  'services platform'},\n",
       " {'graphic designers', 'software web', 'web designers'},\n",
       " {'academic',\n",
       "  'applied',\n",
       "  'behavioral',\n",
       "  'bioinformatics',\n",
       "  'biological',\n",
       "  'business analytics',\n",
       "  'business marketing',\n",
       "  'data science',\n",
       "  'fields',\n",
       "  'genomics',\n",
       "  'pursuing',\n",
       "  'science technology',\n",
       "  'similarly',\n",
       "  'technical communications'},\n",
       " {'abap',\n",
       "  'application integration',\n",
       "  'custom data',\n",
       "  'es',\n",
       "  'product configuration',\n",
       "  'sap abap',\n",
       "  'sap erp',\n",
       "  'sap pi po',\n",
       "  'sap portal',\n",
       "  'sap workflow',\n",
       "  'technical configuration',\n",
       "  'workbench'},\n",
       " {'css3 html5'},\n",
       " {'attainment',\n",
       "  'business sales',\n",
       "  'client relationship',\n",
       "  'exceed',\n",
       "  'exceeded',\n",
       "  'exceeding',\n",
       "  'monthly quarterly',\n",
       "  'quota',\n",
       "  'quotas',\n",
       "  'sales goals',\n",
       "  'sales pipeline',\n",
       "  'sales strategies',\n",
       "  'sales targets',\n",
       "  'targets'},\n",
       " {'additional information',\n",
       "  'attach',\n",
       "  'attachment',\n",
       "  'brown',\n",
       "  'current',\n",
       "  'email address',\n",
       "  'emailing',\n",
       "  'inquire',\n",
       "  'mention',\n",
       "  'mentioned',\n",
       "  'ms word format',\n",
       "  'submit resumes'},\n",
       " {'hadoop administrator',\n",
       "  'sql database administrator',\n",
       "  'sql dba',\n",
       "  'sql server dba',\n",
       "  'sql server developer'},\n",
       " {'.net',\n",
       "  '.net html',\n",
       "  '.net programming',\n",
       "  '.net sql',\n",
       "  '.net sql server',\n",
       "  '.net wpf',\n",
       "  'asp .net',\n",
       "  'asp asp.net',\n",
       "  'asp.net visual',\n",
       "  'c#',\n",
       "  'c# programming',\n",
       "  'c# vb.net asp.net',\n",
       "  'c# visual',\n",
       "  'com',\n",
       "  'csharp',\n",
       "  'css sql',\n",
       "  'dot net',\n",
       "  'foxpro',\n",
       "  'javascript c#',\n",
       "  'microsoft visual',\n",
       "  'ms visual',\n",
       "  'server .net',\n",
       "  'server visual',\n",
       "  'sql .net',\n",
       "  'sql c#',\n",
       "  'sql sql server',\n",
       "  'vb.net',\n",
       "  'vb6',\n",
       "  'vc++',\n",
       "  'visual basic .net',\n",
       "  'visual c++',\n",
       "  'visual studio .net',\n",
       "  'visual studio sql'},\n",
       " {'project manager', 'team lead'},\n",
       " {'concepts principles',\n",
       "  'it environments',\n",
       "  'principles practices',\n",
       "  'process procedures',\n",
       "  'standard it',\n",
       "  'technical platforms'},\n",
       " {'base sas',\n",
       "  'business intelligence systems',\n",
       "  'complex reporting',\n",
       "  'data management',\n",
       "  'dwh',\n",
       "  'ets',\n",
       "  'sas',\n",
       "  'sas data',\n",
       "  'sas di',\n",
       "  'sas enterprise',\n",
       "  'solutions leveraging',\n",
       "  'sql business',\n",
       "  'studio',\n",
       "  'unica'},\n",
       " {'single page application',\n",
       "  'single page applications',\n",
       "  'single page web',\n",
       "  'spa'},\n",
       " {'bleeding',\n",
       "  'continually',\n",
       "  'decrease',\n",
       "  'drive improvements',\n",
       "  'drive process',\n",
       "  'efficiencies',\n",
       "  'expedite',\n",
       "  'identify process',\n",
       "  'identify recommend',\n",
       "  'implement improvements',\n",
       "  'implement process',\n",
       "  'implement process improvements',\n",
       "  'implementing process',\n",
       "  'improving processes',\n",
       "  'increase customer',\n",
       "  'intervention',\n",
       "  'minimizes',\n",
       "  'productivity quality',\n",
       "  'recommend design',\n",
       "  'reduce',\n",
       "  'reduce costs',\n",
       "  'share best practices',\n",
       "  'standardize',\n",
       "  'streamline',\n",
       "  'streamline processes'},\n",
       " {'appropriate resources',\n",
       "  'escalate problems',\n",
       "  'established procedures',\n",
       "  'resolve incidents'},\n",
       " {'delivery models', 'iaas', 'saas paas'},\n",
       " {'database engineering',\n",
       "  'databases',\n",
       "  'my sql',\n",
       "  'mysql database',\n",
       "  'mysql databases',\n",
       "  'mysql oracle',\n",
       "  'mysql postgres',\n",
       "  'mysql postgresql',\n",
       "  'postgres',\n",
       "  'postgresql',\n",
       "  'rdms',\n",
       "  'relational databases sql',\n",
       "  'sql databases'},\n",
       " {'auditor cisa',\n",
       "  'casp',\n",
       "  'ccna security',\n",
       "  'ccnp security',\n",
       "  'ccsp',\n",
       "  'ceh',\n",
       "  'certification cissp',\n",
       "  'certifications cissp',\n",
       "  'certified ethical hacker',\n",
       "  'certified information',\n",
       "  'certified information security manager',\n",
       "  'certified information systems auditor',\n",
       "  'certified information systems security',\n",
       "  'cism',\n",
       "  'cism cisa',\n",
       "  'cism cissp',\n",
       "  'cissp certified information',\n",
       "  'cissp cisa',\n",
       "  'cissp giac',\n",
       "  'gsec',\n",
       "  'isc2',\n",
       "  'oscp',\n",
       "  'sans giac',\n",
       "  'security+ ce',\n",
       "  'sscp'},\n",
       " {'latest versions', 'manager oracle', 'oracle ibm', 'pack', 'product suite'},\n",
       " {'computer security incident',\n",
       "  'emergency response',\n",
       "  'incident response team',\n",
       "  'preparedness',\n",
       "  'response team'},\n",
       " {'functional specs', 'spec'},\n",
       " {'active directory dns',\n",
       "  'hyperv',\n",
       "  'microsoft hyper v',\n",
       "  'networking storage',\n",
       "  'sccm scom',\n",
       "  'server build',\n",
       "  'server builds',\n",
       "  'server vmware',\n",
       "  'software stack',\n",
       "  'vmware environment',\n",
       "  'vmware esx',\n",
       "  'windows server administration'},\n",
       " {'coordinate', 'oversee', 'plan coordinate', 'support ongoing'},\n",
       " {'application vulnerability',\n",
       "  'assessment',\n",
       "  'assessment reports',\n",
       "  'assessments',\n",
       "  'conducting security',\n",
       "  'continuous monitoring',\n",
       "  'identify security',\n",
       "  'network vulnerability',\n",
       "  'penetration tests',\n",
       "  'perform network',\n",
       "  'perform periodic',\n",
       "  'perform risk',\n",
       "  'perform security',\n",
       "  'perform vulnerability',\n",
       "  'performing security',\n",
       "  'performing vulnerability',\n",
       "  'poa&m',\n",
       "  'policy compliance',\n",
       "  'remediating',\n",
       "  'remediation',\n",
       "  'remediation activities',\n",
       "  'remediation efforts',\n",
       "  'review security',\n",
       "  'scanning',\n",
       "  'scans',\n",
       "  'security analysis',\n",
       "  'security assessment',\n",
       "  'security assessments',\n",
       "  'security audits',\n",
       "  'security issues',\n",
       "  'security vulnerability',\n",
       "  'support security',\n",
       "  'vulnerability assessments',\n",
       "  'vulnerability scanning',\n",
       "  'vulnerability scans',\n",
       "  'vulnerability testing'},\n",
       " {'applications utilizing',\n",
       "  'oracle application developer',\n",
       "  'testing maintenance'}]"
      ]
     },
     "execution_count": 102,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "lbl2cluster.values()[0:100]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Dump Clusters to File for Later Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 106,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Persist every cluster to CLUSTERS_FILE, one cluster per line, formatted as\n",
    "#   <label>|<member>,<member>,...\n",
    "# Members are sorted so each line is stable for a given cluster.\n",
    "with open(CLUSTERS_FILE, \"w+\") as out_file:\n",
    "    for cluster_id, members in lbl2cluster.items():\n",
    "        # label prefix first, then the comma-joined members and a newline\n",
    "        prefix = str(cluster_id) + \"|\"\n",
    "        out_file.write(prefix)\n",
    "        out_file.write(\",\".join(sorted(members)) + \"\\n\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
